resultsPath=file.path(getwd(),"Results")
# Gather parameters from command line
#dir.create(file.path(resultsPath,"cache"), showWarnings=F, recursive=T)
nCores <- parallel::detectCores()#params$nCores
subsetGenes <- params$subsetGenes
subsetCells <- params$subsetCells
resolution <- as.numeric(params$resolution)
interactive <- params$interactive
root <- getwd()
# Have to setwd via knitr
# knitr::opts_knit$set(root.dir=resultsPath, child.path = resultsPath)
knitr::opts_chunk$set(echo=T, error=T, root.dir = resultsPath
# cache=T, cache.lazy=T
)
# Utilize parallel processing later on
cat(paste("**** Utilized Cores **** =", nCores)) ## **** Utilized Cores **** = 12
## $subsetGenes
## [1] "protein_coding"
##
## $subsetCells
## [1] "F"
##
## $resolution
## [1] "0.4"
##
## $resultsPath
## [1] "/sc/orga/projects/pd-omics/brian/PD_scRNAseq/Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.4"
##
## $interactive
## [1] "F"
** /sc/orga/projects/pd-omics/brian/PD_scRNAseq/Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.4 **
## Error in library(ulimit): there is no package called 'ulimit'
## Error in library(benchmarkme): there is no package called 'benchmarkme'
library(gridExtra)
library(knitr)
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny)
library(ggrepel)
library(DT)
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap")
# install.packages('devtools')
# devtools::install_github('talgalili/heatmaply')
## Install Bioconductor
# if (!requireNamespace("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install(c("biomaRt"))
library(biomaRt)
# BiocManager::install(c("DESeq2"))
library(DESeq2)
# library(snow); #BiocManager::install("Rmpi") #NOTE: different lib name than install name (snow vs Rmpi)
createDT <- function(DF, caption="", scrollY=500){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
# Useful Seurat functions
## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
##
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-common/intel/parallel_studio_xe_2018/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] C
##
## attached base packages:
## [1] parallel stats4 grid stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] DESeq2_1.20.0 SummarizedExperiment_1.10.1
## [3] DelayedArray_0.6.1 BiocParallel_1.14.2
## [5] matrixStats_0.53.1 Biobase_2.40.0
## [7] GenomicRanges_1.32.4 GenomeInfoDb_1.16.0
## [9] IRanges_2.14.10 S4Vectors_0.18.3
## [11] BiocGenerics_0.26.0 biomaRt_2.36.1
## [13] ComplexHeatmap_1.18.0 DT_0.4
## [15] ggrepel_0.8.0 shiny_1.1.0
## [17] reshape2_1.4.3 viridis_0.5.1
## [19] viridisLite_0.3.0 plotly_4.7.1
## [21] knitr_1.20 gridExtra_2.3
## [23] dplyr_0.7.6 Seurat_2.3.4
## [25] Matrix_1.2-14 cowplot_0.9.3
## [27] ggplot2_3.0.0
##
## loaded via a namespace (and not attached):
## [1] snow_0.4-3 backports_1.1.2 circlize_0.4.4
## [4] Hmisc_4.1-1 plyr_1.8.4 igraph_1.2.1
## [7] lazyeval_0.2.1 splines_3.5.1 digest_0.6.17
## [10] foreach_1.4.4 htmltools_0.3.6 lars_1.2
## [13] gdata_2.18.0 magrittr_1.5 checkmate_1.8.5
## [16] memoise_1.1.0 cluster_2.0.7-1 mixtools_1.1.0
## [19] ROCR_1.0-7 annotate_1.58.0 R.utils_2.7.0
## [22] prettyunits_1.0.2 colorspace_1.3-2 blob_1.1.1
## [25] crayon_1.3.4 RCurl_1.95-4.11 jsonlite_1.5
## [28] genefilter_1.62.0 bindr_0.1.1 survival_2.42-6
## [31] zoo_1.8-3 iterators_1.0.10 ape_5.1
## [34] glue_1.3.0 gtable_0.2.0 zlibbioc_1.26.0
## [37] XVector_0.20.0 GetoptLong_0.1.7 kernlab_0.9-26
## [40] shape_1.4.4 prabclus_2.2-6 DEoptimR_1.0-8
## [43] scales_0.5.0 mvtnorm_1.0-8 DBI_1.0.0
## [46] Rcpp_1.0.0 metap_0.9 dtw_1.20-1
## [49] xtable_1.8-2 progress_1.2.0 htmlTable_1.12
## [52] reticulate_1.7 foreign_0.8-70 bit_1.1-14
## [55] proxy_0.4-22 mclust_5.4.1 SDMTools_1.1-221
## [58] Formula_1.2-3 tsne_0.1-3 htmlwidgets_1.2
## [61] httr_1.3.1 gplots_3.0.1 RColorBrewer_1.1-2
## [64] fpc_2.1-11 acepack_1.4.1 modeltools_0.2-22
## [67] ica_1.0-2 pkgconfig_2.0.2 XML_3.98-1.12
## [70] R.methodsS3_1.7.1 flexmix_2.3-14 nnet_7.3-12
## [73] locfit_1.5-9.1 tidyselect_0.2.4 rlang_0.2.1
## [76] later_0.7.3 AnnotationDbi_1.42.1 munsell_0.5.0
## [79] tools_3.5.1 RSQLite_2.1.1 ggridges_0.5.0
## [82] evaluate_0.11 stringr_1.3.1 yaml_2.1.19
## [85] bit64_0.9-7 fitdistrplus_1.0-9 robustbase_0.93-1.1
## [88] caTools_1.17.1 purrr_0.2.5 RANN_2.6
## [91] bindrcpp_0.2.2 pbapply_1.3-4 nlme_3.1-137
## [94] mime_0.5 R.oo_1.22.0 hdf5r_1.0.0
## [97] compiler_3.5.1 rstudioapi_0.7 png_0.1-7
## [100] geneplotter_1.58.0 tibble_1.4.2 stringi_1.2.4
## [103] lattice_0.20-35 trimcluster_0.1-2.1 pillar_1.3.0
## [106] lmtest_0.9-36 GlobalOptions_0.1.0 data.table_1.11.8
## [109] bitops_1.0-6 irlba_2.3.2 httpuv_1.4.5
## [112] R6_2.3.0 latticeExtra_0.6-28 promises_1.0.1
## [115] KernSmooth_2.23-15 codetools_0.2-15 MASS_7.3-50
## [118] gtools_3.8.1 assertthat_0.2.0 rprojroot_1.3-2
## [121] rjson_0.2.20 withr_2.1.2 GenomeInfoDbData_1.1.0
## [124] diptest_0.75-7 doSNOW_1.0.16 hms_0.4.2
## [127] rpart_4.1-13 tidyr_0.8.1 class_7.3-14
## [130] rmarkdown_1.10 segmented_0.5-3.0 Rtsne_0.13
## [133] base64enc_0.1-3
## [1] "Seurat 2.3.4"
Rstudio has a default memory limit of only 1GB. To override this, detect the true memory available and set a new limit.
## Error in loadNamespace(name): there is no package called 'benchmarkme'
## Error in strsplit(RAM, " "): object 'RAM' not found
## Error in eval(quote(list(...)), env): object 'RAM' not found
## Error in loadNamespace(name): there is no package called 'ulimit'
## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path(root,"Data"), showWarnings=F)
load(file.path(root,"Data/seurat_object_add_HTO_ids.Rdata"))
pbmc <- seurat.obj
rm(seurat.obj)## An object of class seurat in project RAJ_13357
## 24914 genes across 22113 samples.
metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
createDT( metadata, caption = "Metadata") ## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Make AgeGroups
makeAgeGroups <- function(){
dim(metadata)
getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit))
ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
AgeGroupsUniq <- c()
for (i in 1:(length(ageBreaks)-1)){
AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-"))
}
data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age,
breaks = ageBreaks,
right = F,
labels = AgeGroupsUniq,
nclude.lowest=T)]
metadata <- data.frame(metadata)
unique(metadata$AgeGroups)
head(metadata)
dim(metadata)
return(metadata)
}
# metadata <- makeAgeGroups()
pbmc <- AddMetaData(object = pbmc, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata)
if(subsetCells==F){
pbmc <- FilterCells(object = pbmc, subset.names = "nGene", low.thresholds = 0)
} else {pbmc <- FilterCells(object = pbmc, subset.names = "nGene", low.thresholds = 0,
# Subset for testing
cells.use = pbmc@cell.names[0:subsetCells]
)
} ## Warning in as.list.environment(environment(), all = TRUE): NAs introduced
## by coercion
## Error in 0:subsetCells: NA/NaN argument
Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html
subsetBiotypes <- function(pbmc, subsetGenes){
if( subsetGenes!=F ){
cat(paste("Subsetting genes:",subsetGenes))
# If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
if(file_test("-f", file.path(root,"Data/gene_biotypes.csv"))){
biotypes <- read.csv(file.path(root,"Data/gene_biotypes.csv"))
}
else {
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
dataset="hsapiens_gene_ensembl")
ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
listFilters(ensembl)
listAttributes(ensembl)
biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
values=row.names(pbmc@data), mart=ensembl)
write.csv(biotypes, file.path(root,"Data/gene_biotypes.csv"), quote=F, row.names=F)
}
# Subset data by creating new Seurat object (annoying but necessary)
geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"]
cat(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1],
"genes are", subsetGenes))
# Add back into pbmc
subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
pbmc <- pbmc_sub
rm(list = c("pbmc_sub","geneSubset", "subset.matrix", "orig.ident"))
}
}
subsetBiotypes(pbmc, subsetGenes)## Subsetting genes: protein_coding14827 / 24914 genes are protein_coding
Filter by cells, normalize , filter by gene variability.
** Important!**: * In ScaleData… + Specify do.par = F (unless you have parallel processing set up properly, this will cause your script to crash) + Specify num.cores = nCores (to use all available cores, determined by parallel::detectCores())
Regress out: number of unique transcripts (nUMI), % mitochondrial transcripts (percent.mito)
# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the 'gene.use' arg in 'ScaleData'
## This will greatly reduced the computational load.
# par.Cores <- ifelse(nCores <= 12, 48, nCores)
pbmc <- ScaleData(object = pbmc, genes.use = pbmc@var.genes, vars.to.regress = c("nUMI", "percent.mito"),
do.par = F, num.cores = nCores)## Regressing out: nUMI, percent.mito
## Warning in RegressOutResid(object = object, vars.to.regress =
## vars.to.regress, : For parallel processing, please set do.par to TRUE.
##
## Time Elapsed: 23.1420209407806 secs
## Scaling data matrix
## An object of class seurat in project RAJ_13357
## 24914 genes across 19144 samples.
vp <- VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"),nCol = 3, do.return = T) %>% + ggplot2::aes(alpha=0.5)
vpProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above
# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print=F, verbose=T) #, pcs.print = 1:5, genes.print = 5## Working dimension size 27
## Initializing starting vector v with samples from standard normal distribution.
## Use `set.seed` first for reproducibility.
## irlba: using fast C implementation
# Store in Seurat object so you don't have to recalculate it for the tSNE/UMAP steps
pbmc <- ProjectPCA(object = pbmc, do.print=F) do.hover <-ifelse(interactive==T, T, F)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2, do.hover=do.hover, data.hover="mut")# 'PCHeatmap' is a wrapper for heatmap.2
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced=T, label.columns=F, use.full=F) Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time
#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.
On Resolution
NA
# TRY DIFFERENT RESOLUTIONS
pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering")
# pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering")
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = resolution, print.output = T, save.SNN = T,
n.start = 10, nn.eps = 0.5) ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 19144
## Number of edges: 769919
##
## Running Louvain algorithm...
## Random start: 1
## Iteration: 1
## Modularity: 0.7613
## Iteration: 2
## Modularity: 0.7681
## Iteration: 3
## Modularity: 0.7681
##
## Random start: 2
## Iteration: 1
## Modularity: 0.7540
## Iteration: 2
## Modularity: 0.7699
## Iteration: 3
## Modularity: 0.7699
##
## Random start: 3
## Iteration: 1
## Modularity: 0.7661
## Iteration: 2
## Modularity: 0.7707
## Iteration: 3
## Modularity: 0.7707
##
## Random start: 4
## Iteration: 1
## Modularity: 0.7603
## Iteration: 2
## Modularity: 0.7683
## Iteration: 3
## Modularity: 0.7683
##
## Random start: 5
## Iteration: 1
## Modularity: 0.7594
## Iteration: 2
## Modularity: 0.7664
## Iteration: 3
## Modularity: 0.7664
##
## Random start: 6
## Iteration: 1
## Modularity: 0.7620
## Iteration: 2
## Modularity: 0.7695
## Iteration: 3
## Modularity: 0.7701
## Iteration: 4
## Modularity: 0.7701
##
## Random start: 7
## Iteration: 1
## Modularity: 0.7602
## Iteration: 2
## Modularity: 0.7684
## Iteration: 3
## Modularity: 0.7684
##
## Random start: 8
## Iteration: 1
## Modularity: 0.7598
## Iteration: 2
## Modularity: 0.7652
## Iteration: 3
## Modularity: 0.7652
##
## Random start: 9
## Iteration: 1
## Modularity: 0.7621
## Iteration: 2
## Modularity: 0.7679
## Iteration: 3
## Modularity: 0.7679
##
## Random start: 10
## Iteration: 1
## Modularity: 0.7530
## Iteration: 2
## Modularity: 0.7702
## Iteration: 3
## Modularity: 0.7702
##
## Maximum modularity in 10 random starts: 0.7707
## Number of communities: 6
## Elapsed time: 9 seconds
## Parameters used in latest FindClusters calculation run on: 2019-01-16 13:42:52
## =============================================================================
## Resolution: 0.4
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 10 10
## -----------------------------------------------------------------------------
## Reduction used k.param prune.SNN
## pca 30 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
Additional UMAP arguments detailed here: https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_
As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.
** Important!**: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.
“FItSNE”, a new fast implementation of t-SNE, is also available through RunTSNE. However FItSNE must first be setup on your computer.
labSize <- 6
pbmc <- RunTSNE(object=pbmc, reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
tsne.method = "Rtsne", num_threads=nCores, verbose=T) # FItSNE## Read the 19144 x 10 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 19144
## - point 10000 of 19144
## Done in 9.44 seconds (sparsity = 0.006869)!
## Learning embedding...
## Iteration 50: error is 104.835938 (50 iterations in 19.62 seconds)
## Iteration 100: error is 104.709901 (50 iterations in 25.21 seconds)
## Iteration 150: error is 101.632348 (50 iterations in 21.25 seconds)
## Iteration 200: error is 101.580125 (50 iterations in 22.61 seconds)
## Iteration 250: error is 101.472564 (50 iterations in 20.42 seconds)
## Iteration 300: error is 4.413932 (50 iterations in 13.54 seconds)
## Iteration 350: error is 4.166479 (50 iterations in 14.89 seconds)
## Iteration 400: error is 4.016408 (50 iterations in 15.17 seconds)
## Iteration 450: error is 3.905556 (50 iterations in 15.98 seconds)
## Iteration 500: error is 3.818781 (50 iterations in 16.77 seconds)
## Iteration 550: error is 3.750168 (50 iterations in 16.79 seconds)
## Iteration 600: error is 3.693278 (50 iterations in 16.65 seconds)
## Iteration 650: error is 3.645445 (50 iterations in 16.63 seconds)
## Iteration 700: error is 3.604156 (50 iterations in 16.93 seconds)
## Iteration 750: error is 3.568189 (50 iterations in 16.37 seconds)
## Iteration 800: error is 3.536739 (50 iterations in 17.20 seconds)
## Iteration 850: error is 3.509031 (50 iterations in 17.36 seconds)
## Iteration 900: error is 3.484408 (50 iterations in 17.27 seconds)
## Iteration 950: error is 3.462351 (50 iterations in 17.38 seconds)
## Iteration 1000: error is 3.443001 (50 iterations in 17.65 seconds)
## Fitting performed in 355.69 seconds.
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize, do.return=T) tSNE_metadata_plot <- function(var){
cat(paste("t-SNE Metadata plot for ", var))
# Metadata plot
p1 <- TSNEPlot(pbmc, do.return = T, do.label = T, group.by = var, pt.size=1,
plot.title=paste("Color by ",var), vector.friendly=T) + theme(legend.position = "top")
# layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999))
# t-SNE clusters plot
p2 <- TSNEPlot(pbmc, do.return = T, do.label = T, pt.size=1,
plot.title=paste("Color by Clusters"), vector.friendly=T) + theme(legend.position = "top")
# layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999))
#print(plot_grid(ggplotly(p1), ggplotly(p2)))
if(interactive==T){
fluidPage(
fluidRow(
column(6, p1), column(6, p2)
)
)
} else{return(plot_grid(p1,p2))}
}
# metaVars <- c(dx","mut","Gender","Age")
#
# for (var in metaVars){
# print(paste("t-SNE Metadata plot for ",var))
# # Metadata plot
# p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
# dark.theme=F, plot.title=paste("Color by ",var))
# # t-SNE clusters plot
# p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
# print(plot_grid(p1, p2))
# } ## t-SNE Metadata plot for Age
### Individual ID
## t-SNE Metadata plot for ID
NA
Shown here: Biomarkers of each cluster vs. all other clusters.
# Limit to only top variable genes?:
# Set arg 'only.pos=F' to capture negative biomarkers
pbmc.markers <- FindAllMarkers(object = pbmc, min.pct = 0.25, thresh.use = 0.25, only.pos = F, test.use = "wilcox")
pbmc.markers <- pbmc.markers %>% mutate(FC = 2^avg_logFC)
createDT(pbmc.markers, caption = paste("All Biomarkers: All Clusters"))getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
df <-pbmc.markers %>%
subset(p_val_adj<0.05 & cluster==as.character(clusterID)) %>%
arrange(desc(avg_logFC))
top_pct_markers <- df[1:topN,"gene"]
return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)
### Plot biomarkers
plotBiomarkers <- function(pbmc, biomarkers, cluster){
biomarkerPlots <- list()
for (marker in biomarkers){
p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T)
biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")
}
combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) )
# biomarkerPlots <- lapply(biomarkers, function(marker) {
# VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T) %>% + ggplot2::ggtitle(marker) %>% ggplotly()
# })
# return(subplot(biomarkerPlots) )
}
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
nCols <- floor( sqrt(length(unique(top1$cluster))) )
figHeight <- nCols *7
# Plot top 2 biomarker genes for each
for (clust in unique(pbmc.markers$cluster)){
cat('\n')
cat("### Cluster ",clust,"\n")
biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
plotBiomarkers(pbmc, biomarkers, clust)
cat('\n')
} ##Construct the plot object
volcanoPlot <- function(DEG_df, caption="", topFC_labeled=5){
DEG_df$sig<- ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC<1.5, "p_val_adj<0.05",
ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC>1.5, "p_val_adj<0.05 & avg_logFC>1.5",
"p_val_adj>0.05"
))
DEG_df <- arrange(DEG_df, desc(sig))
vol <- ggplot(data=DEG_df, aes(x=avg_logFC, y= -log10(p_val_adj))) +
geom_point(alpha=0.5, size=3, aes(col=sig)) +
scale_color_manual(values=list("p_val_adj<0.05"="turquoise3",
"p_val_adj<0.05 & avg_logFC>1.5"="purple",
"p_val_adj>0.05" = "darkgray")) +
theme(legend.position = "none") +
xlab(expression(paste("Average ",log^{2},"(fold change)"))) +
ylab(expression(paste(-log^{10},"(p-value)"))) + xlim(-2,2) +
## ggrepl labels
geom_text_repel(data= arrange(DEG_df, p_val_adj, desc(avg_logFC))[1:topFC_labeled,],
# filter(DEG_df, avg_logFC>=1.5)[1:10,],
aes(label=gene), color="black", alpha=.5,
segment.color="black", segment.alpha=.5
) +
# Lines
geom_vline(xintercept= -1.5,lty=4, lwd=.3, alpha=.5) +
geom_vline(xintercept= 1.5,lty=4, lwd=.3, alpha=.5) +
geom_hline(yintercept= -log10(0.05),lty=4, lwd=.3, alpha=.5) +
ggtitle(caption)
print(vol)
}
for (clust in unique(pbmc.markers$cluster)){
cat('\n')
cat("### Cluster ",clust,": Volcano")
cap <- paste("Cluster",clust,"DEG Table")
DEG_df <- subset(pbmc.markers, cluster==as.character(clust)) %>% arrange(desc(avg_logFC))
volcanoPlot(DEG_df, caption = cap)
createDT(DEG_df, caption = cap)
cat('\n')
}##
## ### Cluster 0 : Volcano
##
##
## ### Cluster 1 : Volcano
##
##
## ### Cluster 2 : Volcano
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
##
##
## ### Cluster 3 : Volcano
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
##
##
## ### Cluster 4 : Volcano
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
##
##
## ### Cluster 5 : Volcano
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_text_repel).
fp <- FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", "purple"),
reduction.use = "tsne", nCol = nCols, do.return = T)top5 <- pbmc.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top5$gene, slim.col.label=T, remove.key=T) ## Picking joint bandwidth of 0.207
## Picking joint bandwidth of 0.33
## Picking joint bandwidth of 0.0337
## Picking joint bandwidth of 0.0245
## Picking joint bandwidth of 0.0541
## Picking joint bandwidth of 0.139
Visualize biomarker expression for each cluster, by disease
top2 <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
sdp <- SplitDotPlotGG(pbmc, genes.plot = top2$gene, cols.use = c("blue","red"),
x.lab.rot = T, plot.legend = T, dot.scale = 8, do.return = T, grouping.var = "dx")The following plots show the absolute expression of each biomarker, as opposed to avg_logFC which is dependent on the expression patterns of other cell types being compared.
markerList <- c("CD14", "FCGR3A")
get_markerDF <- function(pbmc, markerList, meta_vars =c("barcode", "dx", "mut","post_clustering", "percent.mito","nGene", "nUMI")){
exp <- pbmc@scale.data %>% data.frame()
marker.matrix <- exp[row.names(exp) %in% markerList, ]
marker.matrix$Gene <- row.names(marker.matrix)
markerMelt <- reshape2:::melt.data.frame(marker.matrix, id.vars = "Gene", variable.name = "Cell",value.name = "Expression")
metaSelect <- pbmc@meta.data[,meta_vars]
markerDF <- merge(markerMelt,metaSelect, by.x="Cell", by.y="barcode")
return(markerDF)
}
markerDF <- get_markerDF(pbmc, markerList)
createDT(markerDF, caption = "Known Marker Expression")## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Explore expression differences between groups
marker_vs_metadata <- function(markerDF, meta_var){
# Create title from ANOVA summary
ANOVAtitle <- function(markerDF, marker){
nTests <- length(unique(markerDF$Gene))
res <- anova(lm(data = subset(markerDF, Gene==marker),
formula = Expression ~ eval(parse(text=meta_var))))
title <-paste(paste("ANOVA (",marker, " vs. ",meta_var, ")", sep=""),
": p=",round(res$`Pr(>F)`,3),
", F=",round(res$`F value`,3),
ifelse(res$`Pr(>F)`<.05/nTests,"(Significant**)",
"(Non-significant)") )
}
title = ""
for (marker in unique(markerDF$Gene) ){
cat(marker)
title <- paste(title, "\n", ANOVAtitle(markerDF, marker))
}
ggplot(markerDF, aes(x=eval(parse(text=meta_var)), y=Expression, fill= Gene)) +
geom_boxplot() +
labs(title = title, x=meta_var) +
theme(plot.title = element_text( size=10)) +
scale_fill_manual(values=c("brown", "slategray"))
}identify_cellTypes_by_biomarkers <- function(pbmc.markers, topN_search=5){
top <- pbmc.markers %>% group_by(cluster) %>% top_n(topN_search, avg_logFC)
clust_cellTypes <- list()
for (clust in top$cluster){
clustSub <- top[top5$cluster==clust, ]
CD16_logFC <- subset(clustSub, gene=="CFD")$avg_logFC
cellType <- ifelse(sum(markerList %in% clustSub$gene), # Both CD14 and CD16? Great, keep going
ifelse(CD16_logFC == abs(CD16_logFC), "CD14++/CD16+", # But does CD16 have pos logfC? If so, then it's "CD14++/CD16+"
"CD14++/CD16--"), # Otherwise, it means it means CD16 logFC is neg, meaning "CD14++/CD16--"
NA) # If it's none of these, it's an undefined cell type
clust_cellTypes[clust] <- as.factor(cellType)
}
newMeta <- pbmc@meta.data
newMeta["CellType_DGE"] <- plyr::mapvalues(newMeta$post_clustering, names(clust_cellTypes), as.character(clust_cellTypes) )
pbmc <- AddMetaData(pbmc, metadata = newMeta)
return(pbmc)
}
pbmc <- identify_cellTypes_by_biomarkers(pbmc.markers, 5)
# (Doesn't make sense to do bar plot because whole clusters are defined by their biomarkers)
tSNE_metadata_plot("CellType_DGE")## t-SNE Metadata plot for CellType_DGE
# A simplistic way of categorizing cells into CD14++/CD16+ and CD14++/CD16--,
## is by splitting cells into groups based on whether their expression is
## higher or lower than the average CD16 expression of all cells.
identify_cellTypes_by_avgExpression <- function(pbmc, markerDF){
avgMarkerExp <-markerDF %>% group_by(Gene) %>% dplyr::summarise(meanExp = mean(Expression))
avgMarkerExp <- setNames(avgMarkerExp$meanExp, avgMarkerExp$Gene)
CD16 <- markerDF[markerDF$Gene=="FCGR3A",]
CD16_group <- ifelse(CD16$Expression >= avgMarkerExp["FCGR3A"], "CD14++/CD16+", "CD14++/CD16--")
CD16["CellType_AvgExp"] <- as.factor(CD16_group)
# Make sure row order is same before putting back into meta.data
metaD <- pbmc@meta.data
newMeta <- merge(metaD, CD16[,c("Cell","CellType_AvgExp")], by.x="barcode", by.y="Cell")
row.names(newMeta) <- row.names(metaD)
pbmc <- AddMetaData(pbmc, metadata = newMeta)
return(pbmc)
}
pbmc <- identify_cellTypes_by_avgExpression(pbmc, markerDF)
# Get proportions of cell types in each cluster
cluster_proportions <- pbmc@meta.data %>% group_by(CellType_AvgExp, post_clustering) %>%
tally() %>%
group_by(post_clustering, CellType_AvgExp) %>%
mutate(percentTotal = n/sum(n)*100)
ggplot(cluster_proportions, aes(x=post_clustering, y=percentTotal, fill=CellType_AvgExp)) + geom_col(position = "fill") +
labs(title="Proportions of Cell-types per Cluster: \n CellType_AvgExp",
x="Cluster", y="Cell Type / Total Cells") +
scale_fill_manual(values=c("brown", "slategray"))## t-SNE Metadata plot for CellType_AvgExp
markerDF <- markerDF %>% mutate(Cluster = post_clustering)
# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression))
p <- ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, dx, Cluster) %>% summarise(meanExp = mean(Expression))
p <- ggplot(data = avgMarker, aes(x=Gene, y=dx, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
pmarkerDF <- get_markerDF(pbmc, markerList,
meta_vars =c("barcode", "dx", "mut","ID","post_clustering", "percent.mito","nGene", "nUMI",
"CellType_DGE","CellType_AvgExp"))
Spectral <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(length(unique(pbmc@meta.data$mut)), "Spectral"))
if (interactive==T){
# Spectral <- heatmaply::Spectral(length(unique(pbmc@meta.data$mut)))
markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0)
heatmaply::heatmaply(markerMelt, key.title="Expression",#plot_method= "ggplot",
k_row = dim(markerMelt)[2], dendrogram = "row",
showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 45,
row_side_colors = pbmc@meta.data[,c("dx","mut", "CellType_DGE")], row_side_palette = Spectral
) %>% colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2) %>%
colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1)
}else{
# markerDF_sub <-subset(markerDF, Gene==markerList[1])
# var_to_colors(markerDF_sub, "post_clustering")
# library(pheatmap)
# pheatmap(markerMelt, annotation_row = markerDF_sub[c("dx","mut","CellType_DGE", "CellType_AvgExp")])
# pheatmap(markerMelt, kmeans_k = NA, annotation_row = markerDF_sub[c("dx","mut","CellType_DGE", "CellType_AvgExp")],
# cluster_cols = F, cutree_rows = length(unique(markerDF$post_clustering)), angle_col=45 )
library(RColorBrewer)
var_to_colors <- function(markerDF, metaVar){
colors <- brewer.pal(length(unique(markerDF[metaVar]) ), "Dark2")
sample(colors, length(unique(markerDF[metaVar])), replace = TRUE, prob = NULL)
# metaColors <- colors[ subset(markerDF, Gene==markerList[1])[metaVar][,1] %>% as.factor() ]
return(metaColors)
}
# library(GMD)
# myCols = cbind(var_to_colors(markerDF, "dx"), var_to_colors(markerDF, "mut"))
# rlab=t(cbind(
# var_to_colors(markerDF, "post_clustering"),
# var_to_colors(markerDF, "dx")
# ))
# heatmap.2(marker.matrix, key.title="Expression", col = viridis(300), trace="none",Colv = F, Rowv = F,
# labRow = F, xlab = "Biomarker", ylab="Cell", cexCol=1, RowSideColors = var_to_colors(markerDF, "post_clustering")
# )
# heatmap.3(markerMelt, dendrogram = 'row', kr = length(unique(markerDF)), labRow = F,
# xlab = "Biomarker", ylab = "Cell", RowSideColors = rlab, RowSideColorsSize=2 )
markerDF <- markerDF %>%
mutate_at(vars(post_clustering, dx, mut, ID, CellType_DGE, CellType_AvgExp), as.factor) %>%
mutate(Cluster = post_clustering) %>%
arrange(post_clustering)
# markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0)
markerMelt <- dcast(markerDF, Cell + post_clustering + dx + mut + ID + CellType_DGE + CellType_AvgExp ~ Gene,
fun.aggregate = mean, value.var = "Expression") %>% arrange(post_clustering)
marker.matrix <- markerMelt[markerList] %>%as.matrix()
row.names(marker.matrix) <- markerMelt$Cell
ha = HeatmapAnnotation(df = markerDF[c("dx","mut","ID","CellType_DGE","CellType_AvgExp")], which = "row")
ComplexHeatmap::Heatmap(marker.matrix, col=viridis(300), column_title = "Biomarker", row_title = "Cell",
row_dend_reorder = F,show_row_names = F, show_column_dend = F,show_row_dend =T,
cluster_rows = T, column_title_side = "bottom",km = length(unique(markerMelt$post_clustering))) + ha
} ## Error in `[.data.frame`(markerMelt, markerList): undefined columns selected
ggplot(data = markerDF, aes(x=Cluster, y=Expression, fill=Gene)) %>%
+ geom_boxplot(alpha=0.5) %>% + scale_fill_manual(values=c("purple", "turquoise")) #, results = 'hide', fig.show='hide'
expressionTSNE <- function(pbmc, marker, colors=c("grey", "red")){
FeaturePlot(object = pbmc, features.plot = marker, cols.use = colors,
reduction.use = "tsne", nCol=2, do.return = T, dark.theme = T)[[1]]
# p <- ifelse(interactive, p %>% ggplotly() %>% toWebGL(), print(p))
}
plot_grid(expressionTSNE(pbmc, markerList[1]),
expressionTSNE(pbmc, markerList[2], colors=c("grey", "green")))current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object=pbmc, do.label=T, pt.size=0.5, do.return=T) # Available DGE methods:
## "wilcox", "bimod", "roc", "t", "tobit", "poisson", "negbinom", "MAST", "DESeq2"
runDGE <- function(pbmc, meta_var, group1, group2, test.use="wilcox"){
#print(paste("DGE_allCells",meta_var,sep="_"))
pbmc <- SetAllIdent(pbmc, id = meta_var)
pbmc <- StashIdent(pbmc, save.name = meta_var)
DEGs <- FindMarkers(pbmc, ident.1=group1, ident.2=group2, test.use=test.use)
DEGs$gene <- row.names(DEGs)
cap <- paste("DEGs:\n",group1, "vs.", group2)
createDT(DEG_df, caption = cap)
volcanoPlot(DEG_df, caption = cap)
pbmc <- SetAllIdent(pbmc, id = "post_clustering")
return(DEGs)
}## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
## Error in WhichCells(object = object, ident = ident.1): Identity : CD14++/CD16+ not found.
## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
DGE_within_clusters <- function(pbmc, meta_var, group1, group2){
for (clust in unique(pbmc@meta.data$post_clustering)){
# Subset cells by cluster
pbmc_clustSub <- Seurat::SubsetData(pbmc, accept.value = clust, subset.raw = T)
cap <- paste("Cluster ",clust,": \n ",group1," vs. ", group2, sep="")
cat('\n')
cat("### ",cap)
DEG_df <-runDGE(pbmc_clustSub, meta_var, group1, group2 )
cat('\n')
}
}##
## ### Cluster 0:
## PD vs. control
## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
##
## ### Cluster 0:
## LRRK2 vs. PD
## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
##
## ### Cluster 0:
## CD14++/CD16+ vs. CD14++/CD16--
## Error in WhichCells(object = object, ident = ident.1): Identity : CD14++/CD16+ not found.
##
## ### Cluster 0:
## CD14++/CD16+ vs. CD14++/CD16--
## Error in FindMarkers(pbmc, ident.1 = group1, ident.2 = group2, test.use = test.use): No genes pass logfc.threshold threshold
If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.
new_resolution <- 3.0
orig_resolution <- paste("resolution",params$resolution,sep="_")
pbmc <- StashIdent(object = pbmc, save.name = orig_resolution)
## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = new_resolution, print.output = F)
pbmc <- StashIdent(object = pbmc, save.name = "resolution_3.0")
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE, label.size=labSize)## Error in FetchData(object = object, vars.all = group.by): Error: ClusterNames_0.6 not found
## Error in plot_grid(plot1, plot2): object 'plot2' not found